# load the dataset that was analyzed and prepared for model building
merged.data <- readRDS("data/merged_data_prepared.RData")
# a quick reminder of how the data is structured
str(merged.data)


################################
######  MODEL BUILDING    ######
################################

# the test dataset has no values for the output variable (Survived), so,
# to build and evaluate a prediction model, we will partition the training
# part of our merged dataset into two parts:
# one for training and one for validation
## <!explain the purpose of validation!>
# once the prediction results on the validation set are sufficiently good,
# we can do the prediction on the test set and submit our results to the
# Kaggle website;
# for now, we need only the first 891 instances of the merged dataset,
# as these are the observations for which the outcome - Survived - is known
outcome.known.data <- merged.data[seq_len(891), ]

# observations for training and validation are to be selected at random,
# while keeping the distribution of the output variable (Survived)
# the same in both datasets (training and validation);
# the caret package offers appropriate functions for doing that easily
# install.packages('caret')
library(caret)
# random processes in R should be preceded by setting the seed,
# so that the results can be replicated
set.seed(1410)
# 80% of all the observations go to training, the rest to validation
train.index <- createDataPartition(
  y = outcome.known.data$Survived,
  p = 0.80,
  list = FALSE
)
train <- outcome.known.data[train.index, ]
validate <- outcome.known.data[-train.index, ]
# verify that Survived is really distributed the same way in the two datasets
prop.table(table(train$Survived))
prop.table(table(validate$Survived))


# The prediction model will be built using Decision Trees,
# so we load rpart, an R package for working with decision trees
# note: it is just one of the R packages available for decision trees
# install.packages('rpart')
library(rpart)

# the model is fitted with the rpart function;
# the value "class" of the method parameter specifies that we want to
# use the majority vote principle for the class prediction
tree.model <- rpart(
  formula = Survived ~ Pclass + Sex + Fare + Embarked + Title + AgeGender +
    FamilySize + TicketCount,
  data = train,
  method = "class"
)
# print the fitted model
print(tree.model)

# a plot of the tree makes the model easier to understand
library(rpart.plot)
par(mfrow = c(1, 1))
prp(tree.model, type = 3, extra = 1)

# a fancier plot of the tree model is also possible;
# it requires some additional packages
# install.packages('rattle')
# install.packages('RColorBrewer')
library(rattle)
library(RColorBrewer)

fancyRpartPlot(tree.model)

# verify the model on the validation set
tree.predict <- predict(tree.model, newdata = validate, type = "class")
# take a look at the first predictions and at their overall distribution
tree.predict[seq_len(10)]
table(tree.predict)
# build the confusion matrix: true classes in rows, predictions in columns
## <!explain confusion matrix and the evaluation metrics!>
cm <- as.matrix(table(true = validate$Survived, predicted = tree.predict))
cm

# since we'll need to compute evaluation metrics a couple of times, it's handy
# to have a function for that
#
# compute.eval.measures(cm)
#   cm - a 2x2 confusion matrix with the true classes in rows and the predicted
#        classes in columns; the second row / column is taken as the positive
#        outcome (the Yes outcome, i.e. survival)
# Returns a named numeric vector with the values for accuracy, precision,
# recall and F1 (the F1-measure).
# Stops with an error if cm is not 2x2.
# Precision (recall) is NaN when there are no positive predictions (no actual
# positives); F1 is computed from the counts, so it is 0 rather than NaN when
# there are no true positives but some false positives or false negatives.
compute.eval.measures <- function(cm) {
  if (length(dim(cm)) != 2L || any(dim(cm) != 2L)) {
    stop("cm must be a 2x2 confusion matrix", call. = FALSE)
  }

  acc <- sum(diag(cm)) / sum(cm)

  # since we're interested in predicting survival, precision and recall are
  # computed from the cells of the confusion matrix related to the Yes outcome
  TP <- cm[2, 2]
  FP <- cm[1, 2]
  FN <- cm[2, 1]
  prec <- TP / (TP + FP)
  rec <- TP / (TP + FN)
  # algebraically the same as (2 * prec * rec) / (prec + rec), but it does not
  # turn into 0/0 when both precision and recall are zero
  f1 <- (2 * TP) / (2 * TP + FP + FN)

  # return the performance metrics
  c(accuracy = acc, precision = prec, recall = rec, F1 = f1)
}

tree.model.eval <- compute.eval.measures(cm)
tree.model.eval

# the growth of a tree is controlled by a number of rpart parameters
# (the above call of the rpart f. relied on the default values of those
# parameters); to inspect the parameters and their defaults, type:
?rpart.control
# we'll now change some of these parameters to try to create a better model
# for instance, we can change
# - cp - it stops splits that aren't deemed important enough (i.e., the splits
#   that do not increase the fitness of the model by at least a factor of cp)
# - minsplit - minimum number of instances in a node for a split to be
#   attempted at that node
tree.model2 <- rpart(
  formula = Survived ~ Pclass + Sex + Fare + Embarked + Title + AgeGender +
    FamilySize + TicketCount,
  data = train,
  method = "class",
  control = rpart.control(minsplit = 15, cp = 0.0025)
)

print(tree.model2)
prp(tree.model2, type = 3, extra = 1)
# the following command - for drawing the fancy tree plot - tends to crash R,
# so, think twice before calling it
# fancyRpartPlot(tree.model2)

# performance of the second model on the training set
train.predict <- predict(tree.model2, newdata = train, type = "class")
train.cm <- as.matrix(table(true = train$Survived, predicted = train.predict))
train.metrics <- compute.eval.measures(train.cm)
train.metrics

# the same metrics, this time on the validation set
validate.predict <- predict(tree.model2, newdata = validate, type = "class")
validate.cm <- as.matrix(
  table(true = validate$Survived, predicted = validate.predict)
)
validate.metrics <- compute.eval.measures(validate.cm)
validate.metrics

# we can create a data frame of the computed metrics to have a better view of
# the metrics and how they differ between the training and the validation set
tree.model2.eval <- data.frame(
  rbind(train.metrics, validate.metrics),
  row.names = c("train", "validate")
)
names(tree.model2.eval) <- c("accuracy", "precision", "recall", "F1")
# knitr is needed for kable(); library() fails right here if the package is
# missing, whereas require() would only return FALSE and let kable() fail later
library(knitr)
kable(tree.model2.eval, format = "rst")
# significant drop in all the metrics
# obviously by growing more complex tree we overfitted the model to the
# training set
## <! explain over-fitting and under-fitting !>


# We'll now do cross-validation to determine optimal CP, i.e., the CP value
# that results in the lowest error (or highest model accuracy) on the
# validation set
## <!explain cross-validation!>

# install cross-validation packages
# install.packages("e1071")
library(e1071)

# define cross-validation parameters: 10-fold cross-validation
numFolds <- trainControl(method = "cv", number = 10)
# the range of the cp values to examine in the cross-validation
cpGrid <- expand.grid(.cp = seq(from = 0.005, to = 0.5, by = 0.005))
# perform the cross validation; the seed is set so the folds can be replicated
set.seed(1410)
# caret::train is spelled out because a data frame named `train` also exists;
# the result is stored so that it can be inspected after printing
# (e.g., tree.cv$bestTune should hold the selected cp value)
tree.cv <- caret::train(
  Survived ~ Pclass + Sex + Fare + Embarked + Title + AgeGender +
    FamilySize + TicketCount,
  data = train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)
# print the cross-validation results (accuracy for each examined cp value)
tree.cv

# the optimal CP value is the one associated with both high accuracy and
# a not-overly complex tree (i.e., a cp value that is not overly small);
# having chosen the optimal value for the CP parameter, we prune the tree
# to improve its performance
pruned.tree.model2 <- prune(tree.model2, cp = 0.05)

prp(pruned.tree.model2, type = 3, extra = 1)
# the fancy version of the tree plot
fancyRpartPlot(pruned.tree.model2)

# performance of the pruned tree on the validation set
val.pred.pruned <- predict(pruned.tree.model2, newdata = validate,
                           type = "class")
val.pruned.cm <- as.matrix(
  table(true = validate$Survived, predicted = val.pred.pruned)
)
val.pruned.cm
val.pruned.metrics <- compute.eval.measures(val.pruned.cm)
val.pruned.metrics
# compare this model with the initial one
tree.model.eval
# apart from precision, all other metrics are better, so,
# tuning the model through cross-validation really helped us get a better model